# Creates a list containing testing and training dataframes
createTraining <- function(data, seed = 123, trainPercent = 0.8) {
set.seed(seed)
n <- nrow(data)
numTrain <- floor(trainPercent * n)
trainingRows <- sample(1:n, size = numTrain, replace = FALSE)
trainingData <- data[trainingRows, ]
testingData <- data[-trainingRows, ]
return(list(training = trainingData, testing = testingData))
}
# Creates confidence and prediction intervals
jjIntervals <- function(data, model) {
confidence <- as.data.frame(predict.lm(model, newdata = data, interval = "confidence")) %>%
rename(confLwr = lwr, confUpr = upr)
prediction <- as.data.frame(predict.lm(model, newdata = data, interval = "prediction")) %>%
rename(predictLwr = lwr, predictUpr = upr) %>%
select(predictLwr, predictUpr)
intervalData <- cbind(data,confidence,prediction)
return(intervalData)
}
# Creates a density plot given parameters
jjplotDensity <- function(data,x,fill,color) {
plot <- ggplot(data, aes(x={{x}})) +
geom_density(aes(fill={{fill}}), alpha=0.4)+
geom_rug(aes(color={{color}}), y=0) +
theme_custom() +
theme(legend.position = "none")
return(plot)
}
# Creates a boxplot given parameters
jjplotBoxplot <- function(data,x,y,fill) {
plot <- ggplot(data=data, aes(x = {{x}}, y = {{y}}, fill = {{fill}})) +
geom_boxplot() +
coord_flip() +
theme_custom() +
theme(legend.position = "none")
return(plot)
}
# Creates a scatter plot
jjplotPoint <- function(data,x,y,color, model) {
data <- jjIntervals(data,model)
plot <- ggplot(data=data, aes(x = {{x}}, y = {{y}}, color = {{color}})) +
geom_point() +
geom_ribbon(aes(ymin = 10^confLwr, ymax = 10^confUpr), fill = "yellow", alpha = 0.4) +
geom_line(aes(y = 10^fit), color = "#3366FF", size = 0.75) +
geom_line(aes(y = 10^confLwr), linetype = "dashed", size = 0.75) +
geom_line(aes(y = 10^confUpr), linetype = "dashed", size = 0.75) +
geom_line(aes(y = 10^predictLwr), linetype = "dashed", color = "red", size = 0.75) +
geom_line(aes(y = 10^predictUpr), linetype = "dashed", color = "red", size = 0.75) +
theme_custom()
return(plot)
}
# Checks a model and gives back error and p-value
checkModel <- function(data, matrix) {
n <- nrow(data)
error <- (matrix[1,2] + matrix[2,1])/n
pHat <- (matrix[2,1]+matrix[2,2])/n
standardError <- sqrt(pHat*(1-pHat)/n)
pValue <- pnorm(error,pHat,standardError)
return(list("error" = error,"pValue" = pValue))
}
# full data set
recid <- read.csv("datasets/Project3Sample4000.csv")
# mystery project data
recidMysteryBox <- read.csv("datasets/Project3Mystery100.csv")
Recidivism is a term used within the criminal justice system which means “the tendency of a criminal to reoffend after serving a sentence in a disciplinary institution.” The data we will be analyzing is from Broward County, Florida and includes recidivism predictions from the COMPAS test given to inmates. The goal of this analysis is to, based on various factors of an inmate, predict whether or not they will reoffend within two years of being released.
By the end of this analysis, we hope to have an accurate classification model for whether or not a person is likely to reoffend as well as have the ability to discuss the accuracy of the model in detail.
Finally, we hope to understand the ethical implications of the model we make and to know how to mitigate and/or measure the biases held by the model itself.
This task revolves around visualizing the data and making the data we are given usable. We clean the full data set found in the Project3Sample4000.csv file. This includes data cleaning, feature engineering, and data refining along with the creation of a testing training split.
## Data Cleaning
recid2 <- recid %>%
rename(
dayBefScreenArrest = days_b_screening_arrest,
jailIn = c_jail_in,
jailOut = c_jail_out,
daysFromCompas = c_days_from_compas,
chargeDegree = c_charge_degree,
chargeDesc = c_charge_desc,
riskRecidDecileScore = RiskRecidDecileScore,
riskRecidScoreLevel = RiskRecidScoreLevel,
riskRecidScreeningDate = RiskRecidScreeningDate,
riskViolenceDecileScore = RiskViolenceDecileScore,
riskViolenceScoreLevel = RiskViolenceScoreLevel
) %>%
mutate(
dob = as_date(dmy(dob)),
ageCat = as.factor(ageCat),
race = as.factor(race),
jailIn = as.Date(dmy_hm(jailIn, tz = "EST")),
jailOut = as.Date(dmy_hm(jailOut, tz = "EST")),
chargeDegree = as.factor(gsub("[()]","",chargeDegree)),
riskRecidScoreLevel = as.factor(riskRecidScoreLevel),
riskRecidScreeningDate = as_date(dmy(riskRecidScreeningDate))
)
## Data Engineering
recid3 <- recid2 %>%
mutate(
daysInJail = as.numeric(difftime(jailOut,jailIn,unit="days")+1),
logDaysInJail = log10(daysInJail),
logPriorsCount = log10(priorsCount+0.1),
juvCount = juvFelonyCount + juvMisdemeanerCount + juvOtherCount,
logJuvCount = log10(juvCount+0.1)
)
## Data Removal
recid4 <- recid3 %>%
select (
-name,
-dob,
-race
)
## Testing Training Split
testingTraining <- createTraining(recid4, seed=8675309)
recidTraining <- testingTraining$training
recidTesting <- testingTraining$testing
In our data cleaning, we forced date variables to be dates and factor variables to be factors. Then we engineered a few categories in the data - some for convenience and some for purpose. These include the following: daysInJail (difference between entry and exit of jail), logDaysInJail (log base 10 of daysInJail), logPriorsCount (log base 10 of priorsCount), juvCount (total of all juvenile crime categories), and logJuvCount (log base 10 of juvCount). We then remove name, dob, and race because name is irrelevant to recidivism, dob is covered by the included age category, and race is not fair to include in a predictive model as there is no definitive difference aside from visually between two people of different races.
In this section we will be creating three different models based on the data set given and now refined. Our first model will be a logistic model to predict whether or not an inmate will reoffend within two years of their release. Our second model will predict the risk of recidivism score. This score was given by a written test given to prisoners and was computed by a third-party company that uses a black-box algorithm to compute the value of riskRecidScoreLevel. Our third model will predict the violence score of an inmate. This was once more calculated from a test administered to inmates and was computed by a third-party company using a black-box algorithm.
This is our first model. It is a logistic regression model which predicts whether or not an inmate will reoffend within two years of being released. The following plots are some data visualization relevant to this model.
### DaysInJail Plot
p1 <- recidTraining %>%
jjplotDensity(x = daysInJail, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="Days in Jail",
x = "Days in Jail"
)
p2 <- recidTraining %>%
jjplotDensity(x = logDaysInJail, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="Log10 of Days in Jail",
x = "log10(daysInJail)"
)
p3 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=daysInJail, fill=as.factor(isRecid)) +
labs(
title="Days in Jail",
y = "Days in Jail",
x = "Reoffence Prediction Proportion"
)
p4 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=logDaysInJail, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
labs(
title="Log Base 10 of Days in Jail",
y = "log10(daysInJail)",
x = "Reoffence Prediction Proportion",
fill = "Reoffence Prediction Proportion"
)
p1 + p2 + p3 + p4 +
plot_annotation(
title = "Days in Jail and log10(Days in Jail)",
theme=theme_custom()
) + plot_layout(guides = 'collect')
Figure 1 illustrates that a higher proportion of inmates who spent less time in prison when compared to the proportion of prisoners likely to reoffend who spent a longer duration in jail. This is shown more clearly in the right two plots as the left two plots are so heavily skewed left that they are not very readable. The left two plots are included to demonstrate that taking the log base 10 of daysInJail eliminates much of the leftward skew therefore being a more sensitive predictor to be included within a model than simply daysInJail.
### Priors Count
p5 <- recidTraining %>%
jjplotDensity(x = priorsCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="Priors Counts",
x = "Priors Counts"
)
p6 <- recidTraining %>%
jjplotDensity(x = logPriorsCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="log10(Priors Counts + 0.1)",
x = "log10(Priors Counts + 0.1)"
)
p7 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=priorsCount, fill=as.factor(isRecid)) +
labs(
title="Priors Counts",
y = "Priors Counts",
x = "Recidivated",
fill = "Recidivated"
)
p8 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=logPriorsCount, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
theme(legend.position = "right") +
labs(
title="log10(Priors Counts + 0.1)",
y = "log10(Priors Counts + 0.1)",
x = "Recidivated",
fill = "Recidivated"
)
p5 + p6 + p7 + p8 + plot_annotation(title = "Priors Counts", theme=theme_custom()) + plot_layout(guides = 'collect')
Figure 2 illustrates that taking the log base 10 of the number of prior offenses (plus 0.1 to avoid taking the log10 of 0) improves the sensitivity of the predictions made with that variable as well as reducing the number of outliers included in the data which means the model will better predict whether or not an inmate will reoffend after being released. For this reason, we will be using the log base 10 of prior count as opposed to just priorCount.
### Juvenile Priors Count
p9 <- recidTraining %>%
jjplotDensity(x = juvCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="Juvenile Priors Counts",
x = "Juvenile Priors Counts"
)
p10 <- recidTraining %>%
jjplotDensity(x = logJuvCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="log10(Juvenile Priors Counts + 0.1)",
x = "log10(Juvenile Priors Counts + 0.1)"
)
p11 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=juvCount, fill=as.factor(isRecid)) +
labs(
title="Juvenile Priors Counts",
y = "Juvenile Priors Counts",
x = "Recidivated",
fill = "Recidivated"
)
p12 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=logJuvCount, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
theme(legend.position = "right") +
labs(
title="log10(Juvenile Priors Counts + 0.1)",
y = "log10(Juvenile Priors Counts + 0.1)",
x = "Recidivated",
fill = "Recidivated"
)
p9 + p10 + p11 + p12 + plot_annotation(title = "Juvenile Priors Counts", theme=theme_custom()) + plot_layout(guides = 'collect')
Figure 3 shows that we can rule out prior crimes committed in juvenile years. There was not enough relevant data in these plots to include the variable or the log base 10 of the variable; therefore, this plot is important to include as a justification of refining our model later on. This is one of the few instances where we could simply rule out a variable this early in the process.
### Age
p13 <- recidTraining %>%
jjplotDensity(x = age, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="Age",
x = "Juvenile Priors Counts"
)
p14 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=age, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
theme(legend.position = "right") +
labs(
title="Age",
x = "Age",
y = "Recidivated",
fill = "Recidivated"
)
p13 / p14 + plot_annotation(theme=theme_custom())
Figure 4 shows that age is a factor in whether or not a person will reoffend. Shown by the density and boxplots, we can determine that in this data, there is a higher proportion of reoffenders in younger populations and a higher proportion of non-reoffenders in older populations. The age cutoff in the trend is 35 years old. This informs us that we could potentially use age as a predictor in the model to predict recidivism.
### Sex
ggplot(data=recidTraining,aes(x=sex, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
geom_bar(position = "dodge") +
labs(
title="Sex",
x = "Sex",
fill = "Recidivated"
) +
theme_custom()
Figure 5 takes into account sex as a factor determining reoffending rates. This plot, regarding reoffending rates between male and female populations, shows us that men have a higher probability in relation to male populations to reoffend than women have in relation to the population of women. This means that this variable would most likely be useful to the model; however, sex along with race will not be included for ethical reasons as it is not ethical or fair to judge someone more harshly based on their sex or race.
### ChargeDegree
ggplot(data=recidTraining,aes(x=chargeDegree, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
geom_bar(position = "dodge") +
labs(
title="Charge Degree",
x = "Charge Degree",
fill = "Recidivated"
) +
theme_custom()
Figure 6 illustrates the difference in recidivism rate based on charge degree. Despite these being ordered properly from most severe crime to least severe crime, there does not seem to be a consistent pattern by eye. This means we can leave this variable in the initial model but it may be refined out later.
### Colinearity Check
p15 <- ggplot(recidTraining, aes(x = logDaysInJail, y = logPriorsCount, color = fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
geom_point() +
labs(
title="log10(Days In Jail) vs log10(Priors Count)",
x = "log10(Days In Jail)",
y = "log10(Priors Count)",
color = "Recidivated"
) +
theme_custom()
p16 <- ggplot(recidTraining, aes(x = logDaysInJail, y = age, color = fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
geom_point() +
labs(
title="log10(Days In Jail) vs Age",
x = "log10(Days In Jail)",
y = "Age",
color = "Recidivated"
) +
theme_custom()
p17 <- ggplot(recidTraining, aes(x = logPriorsCount, y = age, color = fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
geom_point() +
labs(
title="log10(Priors Count) vs Age",
x = "log10(Priors Count)",
y = "Age",
color = "Recidivated"
) +
theme_custom()
p15 / (p16 + p17) + plot_annotation(title = "Colinearity Check", theme=theme_custom()) + plot_layout(guides = 'collect')
Figure 7 shows us that none of the three variables (age, logDaysInJail, and logPriorsCount) are colinear. The three subplots in conjunction show us that each of these three variables is not colinear with any of the others, and, therefore, all of them may be included in the model to potentially be refined out later.